p +
geom_smooth(method = "lm", se = FALSE)p +
geom_smooth(se = FALSE)library(gapminder)
ggplot(data = gapminder,
mapping = aes(x = log(gdpPercap), y = lifeExp)) +
geom_point(alpha = 0.2) +
geom_smooth(method = "lm") +
geom_smooth(method = "lm", formula = y ~ splines::bs(x, df = 3)) +
geom_smooth(method = "loess")model_colors <- RColorBrewer::brewer.pal(3, "Set1")
model_colors## [1] "#E41A1C" "#377EB8" "#4DAF4A"
ggplot(data = gapminder,
mapping = aes(x = log(gdpPercap), y = lifeExp)) +
geom_point(alpha = 0.2) +
geom_smooth(method = "lm", aes(color = "OLS", fill = "OLS")) +
geom_smooth(method = "lm", formula = y ~ splines::bs(x, df = 3),
aes(color = "Cubic Spline", fill = "Cubic Spline")) +
geom_smooth(method = "loess",
aes(color = "LOESS", fill = "LOESS")) +
scale_color_manual(name = "Models", values = model_colors) +
scale_fill_manual(name = "Models", values = model_colors) +
theme(legend.position = "top")credit <- read_csv("data/Credit.csv") %>%
# remove first ID column
select(-X1)
names(credit) <- stringr::str_to_lower(names(credit)) # convert column names to lowercase
glimpse(credit)## Observations: 400
## Variables: 11
## $ income <dbl> 14.9, 106.0, 104.6, 148.9, 55.9, 80.2, 21.0, 71.4, 1...
## $ limit <int> 3606, 6645, 7075, 9504, 4897, 8047, 3388, 7114, 3300...
## $ rating <int> 283, 483, 514, 681, 357, 569, 259, 512, 266, 491, 58...
## $ cards <int> 2, 3, 4, 3, 2, 4, 2, 2, 5, 3, 4, 3, 1, 1, 2, 3, 3, 3...
## $ age <int> 34, 82, 71, 36, 68, 77, 37, 87, 66, 41, 30, 64, 57, ...
## $ education <int> 11, 15, 11, 11, 16, 10, 12, 9, 13, 19, 14, 16, 7, 9,...
## $ gender <chr> "Male", "Female", "Male", "Female", "Male", "Male", ...
## $ student <chr> "No", "Yes", "No", "No", "No", "No", "No", "No", "No...
## $ married <chr> "Yes", "Yes", "No", "No", "Yes", "No", "No", "No", "...
## $ ethnicity <chr> "Caucasian", "Asian", "Asian", "Asian", "Caucasian",...
## $ balance <int> 333, 903, 580, 964, 331, 1151, 203, 872, 279, 1350, ...
pairs(select_if(credit, is.numeric))library(GGally)
ggpairs(select_if(credit, is.numeric))ggpairs(credit, mapping = aes(color = gender),
columns = c("income", "limit", "rating", "cards", "age", "education", "balance"))ggpairs(select_if(credit, is.numeric),
lower = list(
continuous = "smooth"
)
)ggpairs(select_if(credit, is.numeric),
lower = list(
continuous = wrap("smooth", alpha = .1, color = "blue")
)
)scatter_smooth <- function(data, mapping, ...) {
ggplot(data = data, mapping = mapping) +
# make data points transparent
geom_point(alpha = .2) +
# add default smoother
geom_smooth(se = FALSE)
}
ggpairs(select_if(credit, is.numeric),
lower = list(
continuous = scatter_smooth
)
)ggpairs(credit, mapping = aes(color = gender),
columns = c("income", "limit", "rating", "cards", "age", "education", "balance"),
lower = list(
continuous = scatter_smooth
)
)rcfss::scorecard %>%
select(type:debt) %>%
ggpairs(mpg_lite <- select_if(mpg, is.numeric))## # A tibble: 234 x 5
## displ year cyl cty hwy
## <dbl> <int> <int> <int> <int>
## 1 1.80 1999 4 18 29
## 2 1.80 1999 4 21 29
## 3 2.00 2008 4 20 31
## 4 2.00 2008 4 21 30
## 5 2.80 1999 6 16 26
## 6 2.80 1999 6 18 26
## 7 3.10 2008 6 18 27
## 8 1.80 1999 4 18 26
## 9 1.80 1999 4 16 25
## 10 2.00 2008 4 20 28
## # ... with 224 more rows
(cormat <- mpg_lite %>%
cor %>%
round(2))## displ year cyl cty hwy
## displ 1.00 0.15 0.93 -0.80 -0.77
## year 0.15 1.00 0.12 -0.04 0.00
## cyl 0.93 0.12 1.00 -0.81 -0.76
## cty -0.80 -0.04 -0.81 1.00 0.96
## hwy -0.77 0.00 -0.76 0.96 1.00
library(reshape2)
(melted_cormat <- melt(cormat))## Var1 Var2 value
## 1 displ displ 1.00
## 2 year displ 0.15
## 3 cyl displ 0.93
## 4 cty displ -0.80
## 5 hwy displ -0.77
## 6 displ year 0.15
## 7 year year 1.00
## 8 cyl year 0.12
## 9 cty year -0.04
## 10 hwy year 0.00
## 11 displ cyl 0.93
## 12 year cyl 0.12
## 13 cyl cyl 1.00
## 14 cty cyl -0.81
## 15 hwy cyl -0.76
## 16 displ cty -0.80
## 17 year cty -0.04
## 18 cyl cty -0.81
## 19 cty cty 1.00
## 20 hwy cty 0.96
## 21 displ hwy -0.77
## 22 year hwy 0.00
## 23 cyl hwy -0.76
## 24 cty hwy 0.96
## 25 hwy hwy 1.00
ggplot(melted_cormat, aes(x = Var1, y = Var2, fill = value)) +
geom_tile()# Get lower triangle of the correlation matrix
get_lower_tri<-function(cormat){
cormat[upper.tri(cormat)] <- NA
return(cormat)
}
# Get upper triangle of the correlation matrix
get_upper_tri <- function(cormat){
cormat[lower.tri(cormat)]<- NA
return(cormat)
}
upper_tri <- get_upper_tri(cormat)
upper_tri## displ year cyl cty hwy
## displ 1 0.15 0.93 -0.80 -0.77
## year NA 1.00 0.12 -0.04 0.00
## cyl NA NA 1.00 -0.81 -0.76
## cty NA NA NA 1.00 0.96
## hwy NA NA NA NA 1.00
melted_cormat <- melt(upper_tri, na.rm = TRUE)
ggplot(melted_cormat, aes(Var2, Var1, fill = value))+
geom_tile(color = "white") +
scale_fill_gradient2(low = "blue", high = "red", mid = "white",
midpoint = 0, limit = c(-1,1), space = "Lab",
name="Pearson\nCorrelation") +
theme(axis.text.x = element_text(angle = 45, vjust = 1,
size = 12, hjust = 1)) +
coord_fixed()reorder_cormat <- function(cormat){
# Use correlation between variables as distance
dd <- as.dist((1-cormat)/2)
hc <- hclust(dd)
cormat <-cormat[hc$order, hc$order]
}
# Reorder the correlation matrix
cormat <- reorder_cormat(cormat)
upper_tri <- get_upper_tri(cormat)
# Melt the correlation matrix
melted_cormat <- melt(upper_tri, na.rm = TRUE)
# Create a ggheatmap
ggheatmap <- ggplot(melted_cormat, aes(Var2, Var1, fill = value))+
geom_tile(color = "white")+
scale_fill_gradient2(low = "blue", high = "red", mid = "white",
midpoint = 0, limit = c(-1,1), space = "Lab",
name="Pearson\nCorrelation") +
theme_minimal()+ # minimal theme
theme(axis.text.x = element_text(angle = 45, vjust = 1,
size = 12, hjust = 1))+
coord_fixed()
# Print the heatmap
print(ggheatmap)ggheatmap +
geom_text(aes(Var2, Var1, label = value), color = "black", size = 4) +
theme(
axis.title.x = element_blank(),
axis.title.y = element_blank(),
panel.grid.major = element_blank(),
panel.border = element_blank(),
panel.background = element_blank(),
axis.ticks = element_blank(),
legend.position = "bottom")cormat_heatmap <- function(data){
# generate correlation matrix
cormat <- round(cor(data), 2)
# melt into a tidy table
get_upper_tri <- function(cormat){
cormat[lower.tri(cormat)]<- NA
return(cormat)
}
upper_tri <- get_upper_tri(cormat)
# reorder matrix based on coefficient value
reorder_cormat <- function(cormat){
# Use correlation between variables as distance
dd <- as.dist((1-cormat)/2)
hc <- hclust(dd)
cormat <-cormat[hc$order, hc$order]
}
cormat <- reorder_cormat(cormat)
upper_tri <- get_upper_tri(cormat)
# Melt the correlation matrix
melted_cormat <- melt(upper_tri, na.rm = TRUE)
# Create a ggheatmap
ggheatmap <- ggplot(melted_cormat, aes(Var2, Var1, fill = value))+
geom_tile(color = "white")+
scale_fill_gradient2(low = "blue", high = "red", mid = "white",
midpoint = 0, limit = c(-1,1), space = "Lab",
name="Pearson\nCorrelation") +
theme_minimal()+ # minimal theme
theme(axis.text.x = element_text(angle = 45, vjust = 1,
size = 12, hjust = 1))+
coord_fixed()
# add correlation values to graph
ggheatmap +
geom_text(aes(Var2, Var1, label = value), color = "black", size = 4) +
theme(
axis.title.x = element_blank(),
axis.title.y = element_blank(),
panel.grid.major = element_blank(),
panel.border = element_blank(),
panel.background = element_blank(),
axis.ticks = element_blank(),
legend.position = "bottom")
}
cormat_heatmap(select_if(mpg, is.numeric))cormat_heatmap(select_if(credit, is.numeric))cormat_heatmap(select_if(diamonds, is.numeric))ggplot(mpg, aes(displ, hwy, color = class)) +
geom_point()# import data
(vote <- rcfss::mental_health)## # A tibble: 1,317 x 5
## vote96 age educ female mhealth
## <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 1. 60. 12. 0. 0.
## 2 1. 36. 12. 0. 1.
## 3 0. 21. 13. 0. 7.
## 4 0. 29. 13. 0. 6.
## 5 1. 39. 18. 1. 2.
## 6 1. 41. 15. 1. 1.
## 7 1. 48. 20. 0. 2.
## 8 0. 20. 12. 1. 9.
## 9 0. 27. 11. 1. 9.
## 10 0. 34. 7. 1. 2.
## # ... with 1,307 more rows
# estimate model
vote_glm <- glm(vote96 ~ age + educ, data = vote, family = "binomial")
tidy(vote_glm)## term estimate std.error statistic p.value
## 1 (Intercept) -5.0244 0.44482 -11.3 1.38e-29
## 2 age 0.0469 0.00441 10.6 1.94e-26
## 3 educ 0.2816 0.02629 10.7 9.32e-27
# extract predicted probabilities
vote_prob <- vote %>%
data_grid(age = 18:89, educ = 0:20) %>%
modelr::add_predictions(vote_glm) %>%
# convert predicted values to probabilities
mutate(prob = rcfss::logit2prob(pred))
ggplot(vote_prob, aes(age, educ, fill = prob)) +
geom_tile() +
scale_fill_gradient2(midpoint = .5, label = scales::percent) +
labs(title = "Probability of voter turnout in 1996",
x = "Age",
y = "Education (in years)",
fill = "Probability\nof voting")# cleaner image using geom_raster and interpolate = TRUE
ggplot(vote_prob, aes(age, educ, fill = prob)) +
geom_raster(interpolate = TRUE) +
scale_fill_gradient2(midpoint = .5, label = scales::percent) +
labs(title = "Probability of voter turnout in 1996",
x = "Age",
y = "Education (in years)",
fill = "Probability\nof voting")plotlyplot_ly(vote_prob, x = ~age, y = ~educ, z = ~prob) %>%
add_mesh()plotlyplot_ly(credit, x = ~limit, y = ~balance, z = ~income) %>%
add_mesh()plotlyplot_ly(z = ~volcano) %>% add_surface()volcano %>%
melt %>%
ggplot(aes(Var1, Var2, z = value)) +
geom_contour(aes(color = ..level..))# contingency table
library(descr)
crosstab(happy$happy, happy$sex, prop.t = TRUE, plot = FALSE)## Cell Contents
## |-------------------------|
## | Count |
## | Total Percent |
## |-------------------------|
##
## =======================================
## happy$sex
## happy$happy male female Total
## ---------------------------------------
## not too happy 1904 2460 4364
## 5.5% 7.1%
## ---------------------------------------
## pretty happy 8760 10539 19299
## 25.2% 30.3%
## ---------------------------------------
## very happy 4833 6327 11160
## 13.9% 18.2%
## ---------------------------------------
## Total 15497 19326 34823
## =======================================
crosstab(happy$happy, happy$sex, prop.r = TRUE, plot = FALSE)## Cell Contents
## |-------------------------|
## | Count |
## | Row Percent |
## |-------------------------|
##
## =======================================
## happy$sex
## happy$happy male female Total
## ---------------------------------------
## not too happy 1904 2460 4364
## 43.6% 56.4% 12.5%
## ---------------------------------------
## pretty happy 8760 10539 19299
## 45.4% 54.6% 55.4%
## ---------------------------------------
## very happy 4833 6327 11160
## 43.3% 56.7% 32.0%
## ---------------------------------------
## Total 15497 19326 34823
## =======================================
crosstab(happy$happy, happy$sex, prop.c = TRUE, plot = FALSE)## Cell Contents
## |-------------------------|
## | Count |
## | Column Percent |
## |-------------------------|
##
## =======================================
## happy$sex
## happy$happy male female Total
## ---------------------------------------
## not too happy 1904 2460 4364
## 12.3% 12.7%
## ---------------------------------------
## pretty happy 8760 10539 19299
## 56.5% 54.5%
## ---------------------------------------
## very happy 4833 6327 11160
## 31.2% 32.7%
## ---------------------------------------
## Total 15497 19326 34823
## 44.5% 55.5%
## =======================================
crosstab(happy$happy, happy$sex, prop.c = TRUE, prop.r = TRUE, plot = FALSE)## Cell Contents
## |-------------------------|
## | Count |
## | Row Percent |
## | Column Percent |
## |-------------------------|
##
## =======================================
## happy$sex
## happy$happy male female Total
## ---------------------------------------
## not too happy 1904 2460 4364
## 43.6% 56.4% 12.5%
## 12.3% 12.7%
## ---------------------------------------
## pretty happy 8760 10539 19299
## 45.4% 54.6% 55.4%
## 56.5% 54.5%
## ---------------------------------------
## very happy 4833 6327 11160
## 43.3% 56.7% 32.0%
## 31.2% 32.7%
## ---------------------------------------
## Total 15497 19326 34823
## 44.5% 55.5%
## =======================================
graphics::mosaicplot()mosaicplot(~ sex + happy, data = happy)vcd::mosaic()library(vcd)
mosaic(~ happy + sex, happy)productplots::prodplot()# mosaic plot using productplots
prodplot(happy, ~ happy + sex)# add color
prodplot(happy, ~ happy + sex) +
aes(fill = happy) +
theme(panel.grid = element_blank())prodplot(happy, ~ happy + marital) +
aes(fill = happy) +
theme(legend.position = "none") +
theme(panel.grid = element_blank())productplots::prodplot()ggplot(happy, aes(marital, fill = happy)) +
geom_bar(position = "fill")library(ISLR)
OJ_sum <- OJ %>%
select(ends_with("MM"), ends_with("CH")) %>%
gather(var, value) %>%
group_by(var) %>%
summarize(mean = mean(value),
sd = sd(value),
min = min(value),
max = max(value),
n = n())
# print the table
kable(OJ_sum)| var | mean | sd | min | max | n |
|---|---|---|---|---|---|
| DiscCH | 0.052 | 0.117 | 0.00 | 0.500 | 1070 |
| DiscMM | 0.123 | 0.214 | 0.00 | 0.800 | 1070 |
| LoyalCH | 0.566 | 0.308 | 0.00 | 1.000 | 1070 |
| PctDiscCH | 0.027 | 0.062 | 0.00 | 0.253 | 1070 |
| PctDiscMM | 0.059 | 0.102 | 0.00 | 0.402 | 1070 |
| PriceCH | 1.867 | 0.102 | 1.69 | 2.090 | 1070 |
| PriceMM | 2.085 | 0.134 | 1.69 | 2.290 | 1070 |
| SalePriceCH | 1.816 | 0.143 | 1.39 | 2.090 | 1070 |
| SalePriceMM | 1.962 | 0.253 | 1.19 | 2.290 | 1070 |
| SpecialCH | 0.148 | 0.355 | 0.00 | 1.000 | 1070 |
| SpecialMM | 0.162 | 0.368 | 0.00 | 1.000 | 1070 |
# plot using a single dot plot
ggplot(OJ_sum, aes(x = fct_reorder(var, mean), y = mean)) +
geom_linerange(aes(ymin = mean - 2 * sd,
ymax = mean + 2 * sd),
linetype = 2,
size = .25) +
geom_linerange(aes(ymin = mean - sd,
ymax = mean + sd),
size = 1) +
geom_point() +
coord_flip() +
labs(x = NULL,
y = NULL)# dodge based on OJ brand
OJ_sum %>%
separate(var, into = c("var", "brand"), -3, remove = TRUE) %>%
ggplot(aes(x = fct_reorder(var, mean), y = mean, color = brand)) +
geom_linerange(aes(ymin = mean - 2 * sd,
ymax = mean + 2 * sd),
linetype = 2,
size = .25,
position = position_dodge(width = 0.5)) +
geom_linerange(aes(ymin = mean - sd,
ymax = mean + sd),
size = 1,
position = position_dodge(width = 0.5)) +
geom_point(position = position_dodge(width = 0.5)) +
coord_flip() +
labs(x = NULL,
y = NULL,
color = "Brand")# facet based on OJ brand
OJ_sum %>%
separate(var, into = c("var", "brand"), -3, remove = TRUE) %>%
ggplot(aes(x = fct_reorder(var, mean), y = mean)) +
facet_grid(. ~ brand) +
geom_linerange(aes(ymin = mean - 2 * sd,
ymax = mean + 2 * sd),
linetype = 2,
size = .25) +
geom_linerange(aes(ymin = mean - sd,
ymax = mean + sd),
size = 1) +
geom_point() +
coord_flip() +
labs(x = NULL,
y = NULL,
color = "Brand")gapminder## # A tibble: 1,704 x 6
## country continent year lifeExp pop gdpPercap
## <fct> <fct> <int> <dbl> <int> <dbl>
## 1 Afghanistan Asia 1952 28.8 8425333 779.
## 2 Afghanistan Asia 1957 30.3 9240934 821.
## 3 Afghanistan Asia 1962 32.0 10267083 853.
## 4 Afghanistan Asia 1967 34.0 11537966 836.
## 5 Afghanistan Asia 1972 36.1 13079460 740.
## 6 Afghanistan Asia 1977 38.4 14880372 786.
## 7 Afghanistan Asia 1982 39.9 12881816 978.
## 8 Afghanistan Asia 1987 40.8 13867957 852.
## 9 Afghanistan Asia 1992 41.7 16317921 649.
## 10 Afghanistan Asia 1997 41.8 22227415 635.
## # ... with 1,694 more rows
gapminder %>%
ggplot(aes(year, lifeExp, group = country)) +
geom_line(alpha = 1/3)gapminder_mod <- lm(lifeExp ~ year, data = gapminder)
summary(gapminder_mod)##
## Call:
## lm(formula = lifeExp ~ year, data = gapminder)
##
## Residuals:
## Min 1Q Median 3Q Max
## -39.95 -9.65 1.70 10.33 22.16
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -585.6522 32.3140 -18.1 <2e-16 ***
## year 0.3259 0.0163 20.0 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 11.6 on 1702 degrees of freedom
## Multiple R-squared: 0.19, Adjusted R-squared: 0.189
## F-statistic: 399 on 1 and 1702 DF, p-value: <2e-16
grid <- gapminder %>%
data_grid(year, country)
grid## # A tibble: 1,704 x 2
## year country
## <int> <fct>
## 1 1952 Afghanistan
## 2 1952 Albania
## 3 1952 Algeria
## 4 1952 Angola
## 5 1952 Argentina
## 6 1952 Australia
## 7 1952 Austria
## 8 1952 Bahrain
## 9 1952 Bangladesh
## 10 1952 Belgium
## # ... with 1,694 more rows
grid <- grid %>%
add_predictions(gapminder_mod)
grid## # A tibble: 1,704 x 3
## year country pred
## <int> <fct> <dbl>
## 1 1952 Afghanistan 50.5
## 2 1952 Albania 50.5
## 3 1952 Algeria 50.5
## 4 1952 Angola 50.5
## 5 1952 Argentina 50.5
## 6 1952 Australia 50.5
## 7 1952 Austria 50.5
## 8 1952 Bahrain 50.5
## 9 1952 Bangladesh 50.5
## 10 1952 Belgium 50.5
## # ... with 1,694 more rows
ggplot(gapminder, aes(year, group = country)) +
geom_line(aes(y = lifeExp), alpha = .2) +
geom_line(aes(y = pred), data = grid, color = "red", size = 1)str(gapminder_mod)## List of 12
## $ coefficients : Named num [1:2] -585.652 0.326
## ..- attr(*, "names")= chr [1:2] "(Intercept)" "year"
## $ residuals : Named num [1:1704] -21.7 -21.8 -21.8 -21.4 -20.9 ...
## ..- attr(*, "names")= chr [1:1704] "1" "2" "3" "4" ...
## $ effects : Named num [1:1704] -2455.1 232.2 -20.8 -20.5 -20.2 ...
## ..- attr(*, "names")= chr [1:1704] "(Intercept)" "year" "" "" ...
## $ rank : int 2
## $ fitted.values: Named num [1:1704] 50.5 52.1 53.8 55.4 57 ...
## ..- attr(*, "names")= chr [1:1704] "1" "2" "3" "4" ...
## $ assign : int [1:2] 0 1
## $ qr :List of 5
## ..$ qr : num [1:1704, 1:2] -41.2795 0.0242 0.0242 0.0242 0.0242 ...
## .. ..- attr(*, "dimnames")=List of 2
## .. .. ..$ : chr [1:1704] "1" "2" "3" "4" ...
## .. .. ..$ : chr [1:2] "(Intercept)" "year"
## .. ..- attr(*, "assign")= int [1:2] 0 1
## ..$ qraux: num [1:2] 1.02 1.03
## ..$ pivot: int [1:2] 1 2
## ..$ tol : num 1e-07
## ..$ rank : int 2
## ..- attr(*, "class")= chr "qr"
## $ df.residual : int 1702
## $ xlevels : Named list()
## $ call : language lm(formula = lifeExp ~ year, data = gapminder)
## $ terms :Classes 'terms', 'formula' language lifeExp ~ year
## .. ..- attr(*, "variables")= language list(lifeExp, year)
## .. ..- attr(*, "factors")= int [1:2, 1] 0 1
## .. .. ..- attr(*, "dimnames")=List of 2
## .. .. .. ..$ : chr [1:2] "lifeExp" "year"
## .. .. .. ..$ : chr "year"
## .. ..- attr(*, "term.labels")= chr "year"
## .. ..- attr(*, "order")= int 1
## .. ..- attr(*, "intercept")= int 1
## .. ..- attr(*, "response")= int 1
## .. ..- attr(*, ".Environment")=<environment: R_GlobalEnv>
## .. ..- attr(*, "predvars")= language list(lifeExp, year)
## .. ..- attr(*, "dataClasses")= Named chr [1:2] "numeric" "numeric"
## .. .. ..- attr(*, "names")= chr [1:2] "lifeExp" "year"
## $ model :'data.frame': 1704 obs. of 2 variables:
## ..$ lifeExp: num [1:1704] 28.8 30.3 32 34 36.1 ...
## ..$ year : int [1:1704] 1952 1957 1962 1967 1972 1977 1982 1987 1992 1997 ...
## ..- attr(*, "terms")=Classes 'terms', 'formula' language lifeExp ~ year
## .. .. ..- attr(*, "variables")= language list(lifeExp, year)
## .. .. ..- attr(*, "factors")= int [1:2, 1] 0 1
## .. .. .. ..- attr(*, "dimnames")=List of 2
## .. .. .. .. ..$ : chr [1:2] "lifeExp" "year"
## .. .. .. .. ..$ : chr "year"
## .. .. ..- attr(*, "term.labels")= chr "year"
## .. .. ..- attr(*, "order")= int 1
## .. .. ..- attr(*, "intercept")= int 1
## .. .. ..- attr(*, "response")= int 1
## .. .. ..- attr(*, ".Environment")=<environment: R_GlobalEnv>
## .. .. ..- attr(*, "predvars")= language list(lifeExp, year)
## .. .. ..- attr(*, "dataClasses")= Named chr [1:2] "numeric" "numeric"
## .. .. .. ..- attr(*, "names")= chr [1:2] "lifeExp" "year"
## - attr(*, "class")= chr "lm"
broom::tidy()tidy(gapminder_mod)## term estimate std.error statistic p.value
## 1 (Intercept) -585.652 32.3140 -18.1 2.90e-67
## 2 year 0.326 0.0163 20.0 7.55e-80
tidy(gapminder_mod) %>%
str()## 'data.frame': 2 obs. of 5 variables:
## $ term : chr "(Intercept)" "year"
## $ estimate : num -585.652 0.326
## $ std.error: num 32.314 0.0163
## $ statistic: num -18.1 20
## $ p.value : num 2.90e-67 7.55e-80
broom::augment()augment(gapminder_mod) %>%
as_tibble()## # A tibble: 1,704 x 9
## lifeExp year .fitted .se.fit .resid .hat .sigma .cooksd .std.resid
## <dbl> <int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 28.8 1952 50.5 0.530 -21.7 0.00208 11.6 3.63e-3 -1.87
## 2 30.3 1957 52.1 0.463 -21.8 0.00158 11.6 2.79e-3 -1.88
## 3 32.0 1962 53.8 0.401 -21.8 0.00119 11.6 2.09e-3 -1.87
## 4 34.0 1967 55.4 0.348 -21.4 0.000895 11.6 1.51e-3 -1.84
## 5 36.1 1972 57.0 0.307 -20.9 0.000698 11.6 1.13e-3 -1.80
## 6 38.4 1977 58.7 0.285 -20.2 0.000599 11.6 9.07e-4 -1.74
## 7 39.9 1982 60.3 0.285 -20.4 0.000599 11.6 9.26e-4 -1.76
## 8 40.8 1987 61.9 0.307 -21.1 0.000698 11.6 1.15e-3 -1.81
## 9 41.7 1992 63.5 0.348 -21.9 0.000895 11.6 1.59e-3 -1.88
## 10 41.8 1997 65.2 0.401 -23.4 0.00119 11.6 2.42e-3 -2.01
## # ... with 1,694 more rows
broom::glance()glance(gapminder_mod)## r.squared adj.r.squared sigma statistic p.value df logLik AIC BIC
## 1 0.19 0.189 11.6 399 7.55e-80 2 -6598 13202 13218
## deviance df.residual
## 1 230229 1702
stargazer## term estimate std.error statistic p.value
## 1 (Intercept) -585.652 32.3140 -18.1 2.90e-67
## 2 year 0.326 0.0163 20.0 7.55e-80
stargazerlibrary(stargazer)
stargazer(gapminder_mod, type = "html")| Dependent variable: | |
| lifeExp | |
| year | 0.326*** |
| (0.016) | |
| Constant | -586.000*** |
| (32.300) | |
| Observations | 1,704 |
| R2 | 0.190 |
| Adjusted R2 | 0.189 |
| Residual Std. Error | 11.600 (df = 1702) |
| F Statistic | 399.000*** (df = 1; 1702) |
| Note: | p<0.1; p<0.05; p<0.01 |
stargazergapminder_yr <- lm(lifeExp ~ year, data = gapminder)
gapminder_gdp <- lm(lifeExp ~ gdpPercap, data = gapminder)
gapminder_yr_gdp <- lm(lifeExp ~ year + gdpPercap, data = gapminder)
gapminder_gdp_year_lifeexp <- lm(gdpPercap ~ year + lifeExp, data = gapminder)
stargazer(gapminder_yr, gapminder_gdp, gapminder_yr_gdp, gapminder_gdp_year_lifeexp,
type = "html")| Dependent variable: | ||||
| lifeExp | gdpPercap | |||
| (1) | (2) | (3) | (4) | |
| year | 0.326*** | 0.239*** | -19.000 | |
| (0.016) | (0.014) | (12.500) | ||
| gdpPercap | 0.001*** | 0.001*** | ||
| (0.00003) | (0.00002) | |||
| lifeExp | 457.000*** | |||
| (16.700) | ||||
| Constant | -586.000*** | 54.000*** | -418.000*** | 17,658.000 |
| (32.300) | (0.315) | (27.600) | (24,287.000) | |
| Observations | 1,704 | 1,704 | 1,704 | 1,704 |
| R2 | 0.190 | 0.341 | 0.437 | 0.342 |
| Adjusted R2 | 0.189 | 0.340 | 0.437 | 0.341 |
| Residual Std. Error | 11.600 (df = 1702) | 10.500 (df = 1702) | 9.690 (df = 1701) | 8,003.000 (df = 1701) |
| F Statistic | 399.000*** (df = 1; 1702) | 880.000*** (df = 1; 1702) | 661.000*** (df = 2; 1701) | 441.000*** (df = 2; 1701) |
| Note: | p<0.1; p<0.05; p<0.01 | |||
stargazerstargazer(gapminder_yr, gapminder_gdp, gapminder_yr_gdp, gapminder_gdp_year_lifeexp,
type = "html",
dep.var.labels = c("Life expectancy", "GDP per capita"),
covariate.labels = c("Year", "GDP per capita", "Life expectancy"),
omit.stat = c("ser", "f"))| Dependent variable: | ||||
| Life expectancy | GDP per capita | |||
| (1) | (2) | (3) | (4) | |
| Year | 0.326*** | 0.239*** | -19.000 | |
| (0.016) | (0.014) | (12.500) | ||
| GDP per capita | 0.001*** | 0.001*** | ||
| (0.00003) | (0.00002) | |||
| Life expectancy | 457.000*** | |||
| (16.700) | ||||
| Constant | -586.000*** | 54.000*** | -418.000*** | 17,658.000 |
| (32.300) | (0.315) | (27.600) | (24,287.000) | |
| Observations | 1,704 | 1,704 | 1,704 | 1,704 |
| R2 | 0.190 | 0.341 | 0.437 | 0.342 |
| Adjusted R2 | 0.189 | 0.340 | 0.437 | 0.341 |
| Note: | p<0.1; p<0.05; p<0.01 | |||
out <- lm(formula = lifeExp ~ gdpPercap + pop + continent,
data = gapminder)
summary(out)##
## Call:
## lm(formula = lifeExp ~ gdpPercap + pop + continent, data = gapminder)
##
## Residuals:
## Min 1Q Median 3Q Max
## -49.16 -4.49 0.30 5.11 25.17
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 4.78e+01 3.40e-01 140.82 <2e-16 ***
## gdpPercap 4.50e-04 2.35e-05 19.16 <2e-16 ***
## pop 6.57e-09 1.98e-09 3.33 9e-04 ***
## continentAmericas 1.35e+01 6.00e-01 22.46 <2e-16 ***
## continentAsia 8.19e+00 5.71e-01 14.34 <2e-16 ***
## continentEurope 1.75e+01 6.25e-01 27.97 <2e-16 ***
## continentOceania 1.81e+01 1.78e+00 10.15 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 8.37 on 1697 degrees of freedom
## Multiple R-squared: 0.582, Adjusted R-squared: 0.581
## F-statistic: 394 on 6 and 1697 DF, p-value: <2e-16
min_gdp <- min(gapminder$gdpPercap)
max_gdp <- max(gapminder$gdpPercap)
med_pop <- median(gapminder$pop)
pred_df <- expand.grid(gdpPercap = (seq(from = min_gdp,
to = max_gdp,
length.out = 100)),
pop = med_pop,
continent = c("Africa", "Americas",
"Asia", "Europe", "Oceania")) %>%
as_tibble
pred_df## # A tibble: 500 x 3
## gdpPercap pop continent
## <dbl> <dbl> <fct>
## 1 241. 7023596. Africa
## 2 1385. 7023596. Africa
## 3 2530. 7023596. Africa
## 4 3674. 7023596. Africa
## 5 4818. 7023596. Africa
## 6 5962. 7023596. Africa
## 7 7107. 7023596. Africa
## 8 8251. 7023596. Africa
## 9 9395. 7023596. Africa
## 10 10540. 7023596. Africa
## # ... with 490 more rows
pred_out <- predict(object = out,
newdata = pred_df,
interval = "predict") %>%
as_tibble
pred_out## # A tibble: 500 x 3
## fit lwr upr
## <dbl> <dbl> <dbl>
## 1 48.0 31.5 64.4
## 2 48.5 32.1 64.9
## 3 49.0 32.6 65.4
## 4 49.5 33.1 65.9
## 5 50.0 33.6 66.4
## 6 50.5 34.1 67.0
## 7 51.1 34.6 67.5
## 8 51.6 35.1 68.0
## 9 52.1 35.7 68.5
## 10 52.6 36.2 69.0
## # ... with 490 more rows
pred_df <- bind_cols(pred_df, pred_out)
pred_df## # A tibble: 500 x 6
## gdpPercap pop continent fit lwr upr
## <dbl> <dbl> <fct> <dbl> <dbl> <dbl>
## 1 241. 7023596. Africa 48.0 31.5 64.4
## 2 1385. 7023596. Africa 48.5 32.1 64.9
## 3 2530. 7023596. Africa 49.0 32.6 65.4
## 4 3674. 7023596. Africa 49.5 33.1 65.9
## 5 4818. 7023596. Africa 50.0 33.6 66.4
## 6 5962. 7023596. Africa 50.5 34.1 67.0
## 7 7107. 7023596. Africa 51.1 34.6 67.5
## 8 8251. 7023596. Africa 51.6 35.1 68.0
## 9 9395. 7023596. Africa 52.1 35.7 68.5
## 10 10540. 7023596. Africa 52.6 36.2 69.0
## # ... with 490 more rows
ggplot(data = pred_df %>%
filter(continent %in% c("Europe", "Africa")),
aes(x = gdpPercap,
y = fit, ymin = lwr, ymax = upr,
color = continent,
fill = continent,
group = continent)) +
geom_point(data = gapminder %>%
filter(continent %in% c("Europe", "Africa")),
aes(x = gdpPercap, y = lifeExp,
color = continent),
alpha = 0.5,
inherit.aes = FALSE) +
geom_line() +
geom_ribbon(alpha = 0.2, color = FALSE) +
scale_x_log10(labels = scales::dollar)library(margins)library(socviz)
glimpse(gss_sm)## Observations: 2,867
## Variables: 32
## $ year <dbl> 2016, 2016, 2016, 2016, 2016, 2016, 2016, 2016, 20...
## $ id <dbl> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,...
## $ ballot <dbl+lbl> 1, 2, 3, 1, 3, 2, 1, 3, 1, 3, 2, 1, 2, 3, 2, 3...
## $ age <dbl> 47, 61, 72, 43, 55, 53, 50, 23, 45, 71, 33, 86, 32...
## $ childs <dbl> 3, 0, 2, 4, 2, 2, 2, 3, 3, 4, 5, 4, 3, 5, 7, 2, 6,...
## $ sibs <dbl+lbl> 2, 3, 3, 3, 2, 2, 2, 6, 5, 1, 4, 4, 3, 6, 0, 1...
## $ degree <fct> Bachelor, High School, Bachelor, High School, Grad...
## $ race <fct> White, White, White, White, White, White, White, O...
## $ sex <fct> Male, Male, Male, Female, Female, Female, Male, Fe...
## $ region <fct> New England, New England, New England, New England...
## $ income16 <fct> $170000 or over, $50000 to 59999, $75000 to $89999...
## $ relig <fct> None, None, Catholic, Catholic, None, None, None, ...
## $ marital <fct> Married, Never Married, Married, Married, Married,...
## $ padeg <fct> Graduate, Lt High School, High School, NA, Bachelo...
## $ madeg <fct> High School, High School, Lt High School, High Sch...
## $ partyid <fct> Independent, Ind,near Dem, Not Str Republican, Not...
## $ polviews <fct> Moderate, Liberal, Conservative, Moderate, Slightl...
## $ happy <fct> Pretty Happy, Pretty Happy, Very Happy, Pretty Hap...
## $ partners <fct> NA, 1 Partner, 1 Partner, NA, 1 Partner, 1 Partner...
## $ grass <fct> NA, Legal, Not Legal, NA, Legal, Legal, NA, Not Le...
## $ zodiac <fct> Aquarius, Scorpio, Pisces, Cancer, Scorpio, Scorpi...
## $ pres12 <dbl+lbl> 3, 1, 2, 2, 1, 1, NA, NA, NA, 2, NA, NA, 1, 1,...
## $ wtssall <dbl> 0.957, 0.478, 0.957, 1.914, 1.435, 0.957, 1.435, 0...
## $ income_rc <fct> Gt $170000, Gt $50000, Gt $75000, Gt $170000, Gt $...
## $ agegrp <fct> Age 45-55, Age 55-65, Age 65+, Age 35-45, Age 45-5...
## $ ageq <fct> Age 34-49, Age 49-62, Age 62+, Age 34-49, Age 49-6...
## $ siblings <fct> 2, 3, 3, 3, 2, 2, 2, 6+, 5, 1, 4, 4, 3, 6+, 0, 1, ...
## $ kids <fct> 3, 0, 2, 4+, 2, 2, 2, 3, 3, 4+, 4+, 4+, 3, 4+, 4+,...
## $ religion <fct> None, None, Catholic, Catholic, None, None, None, ...
## $ bigregion <fct> Northeast, Northeast, Northeast, Northeast, Northe...
## $ partners_rc <fct> NA, 1, 1, NA, 1, 1, NA, 1, NA, 3, 1, NA, 1, NA, 0,...
## $ obama <dbl> 0, 1, 0, 0, 1, 1, NA, NA, NA, 0, NA, NA, 1, 1, 0, ...
# make moderates the reference category (i.e. 0)
gss_sm$polviews_m <- relevel(gss_sm$polviews, ref = "Moderate")
# build the model
out_bo <- glm(obama ~ polviews_m + sex*race,
family = "binomial", data = gss_sm)
summary(out_bo)##
## Call:
## glm(formula = obama ~ polviews_m + sex * race, family = "binomial",
## data = gss_sm)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.905 -0.554 0.177 0.542 2.244
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.29649 0.13409 2.21 0.0270 *
## polviews_mExtremely Liberal 2.37295 0.52504 4.52 6.2e-06 ***
## polviews_mLiberal 2.60003 0.35667 7.29 3.1e-13 ***
## polviews_mSlightly Liberal 1.29317 0.24843 5.21 1.9e-07 ***
## polviews_mSlightly Conservative -1.35528 0.18129 -7.48 7.7e-14 ***
## polviews_mConservative -2.34746 0.20038 -11.71 < 2e-16 ***
## polviews_mExtremely Conservative -2.72738 0.38721 -7.04 1.9e-12 ***
## sexFemale 0.25487 0.14537 1.75 0.0796 .
## raceBlack 3.84953 0.50132 7.68 1.6e-14 ***
## raceOther -0.00214 0.43576 0.00 0.9961
## sexFemale:raceBlack -0.19751 0.66007 -0.30 0.7648
## sexFemale:raceOther 1.57483 0.58766 2.68 0.0074 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 2247.9 on 1697 degrees of freedom
## Residual deviance: 1345.9 on 1686 degrees of freedom
## (1169 observations deleted due to missingness)
## AIC: 1370
##
## Number of Fisher Scoring iterations: 6
bo_m <- margins(out_bo)
summary(bo_m)## factor AME SE z p lower
## polviews_mConservative -0.4119 0.0283 -14.5394 0.0000 -0.4674
## polviews_mExtremely Conservative -0.4538 0.0420 -10.7971 0.0000 -0.5361
## polviews_mExtremely Liberal 0.2681 0.0295 9.0996 0.0000 0.2103
## polviews_mLiberal 0.2768 0.0229 12.0736 0.0000 0.2319
## polviews_mSlightly Conservative -0.2658 0.0330 -8.0596 0.0000 -0.3304
## polviews_mSlightly Liberal 0.1933 0.0303 6.3896 0.0000 0.1340
## raceBlack 0.4032 0.0173 23.3568 0.0000 0.3694
## raceOther 0.1247 0.0386 3.2297 0.0012 0.0490
## sexFemale 0.0443 0.0177 2.5073 0.0122 0.0097
## upper
## -0.3564
## -0.3714
## 0.3258
## 0.3218
## -0.2011
## 0.2526
## 0.4371
## 0.2005
## 0.0789
bo_gg <- as_tibble(summary(bo_m))
prefixes <- c("polviews_m", "sex")
bo_gg$factor <- prefix_strip(bo_gg$factor, prefixes)
bo_gg$factor <- prefix_replace(bo_gg$factor, "race", "Race: ")
bo_gg %>%
select(factor, AME, lower, upper) ## # A tibble: 9 x 4
## factor AME lower upper
## * <chr> <dbl> <dbl> <dbl>
## 1 Conservative -0.412 -0.467 -0.356
## 2 Extremely Conservative -0.454 -0.536 -0.371
## 3 Extremely Liberal 0.268 0.210 0.326
## 4 Liberal 0.277 0.232 0.322
## 5 Slightly Conservative -0.266 -0.330 -0.201
## 6 Slightly Liberal 0.193 0.134 0.253
## 7 Race: Black 0.403 0.369 0.437
## 8 Race: Other 0.125 0.0490 0.200
## 9 Female 0.0443 0.00967 0.0789
ggplot(data = bo_gg, aes(x = reorder(factor, AME),
y = AME, ymin = lower, ymax = upper)) +
geom_hline(yintercept = 0, color = "gray80") +
geom_pointrange() + coord_flip() +
labs(x = NULL, y = "Average Marginal Effect")